library(ggplot2)
library(ggthemes)
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(DT)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
library(mapdata)
# Fill palette (seven colours) used by the plots that call
# scale_fill_manual(values = colors).
colors <- c(
  "#05a357", "#0576a3", "#d27dfa", "#f26bae", "#f2d56b", "#f05d5d", "#34baad"
)

# Directory that holds the six monthly raw-data CSV files.
data_dir <- "~/Desktop/Uber-dataset"

# Explicit column specification. These are the types readr reported when it
# guessed them; declaring them avoids the per-file "Parsed with column
# specification" message and keeps all six files on the same schema.
raw_col_types <- cols(
  `Date/Time` = col_character(),
  Lat = col_double(),
  Lon = col_double(),
  Base = col_character()
)

# Read one month's file: read_month("apr") reads "uber-raw-data-apr14.csv"
# from data_dir and returns it as a tibble.
read_month <- function(month_abbr) {
  read_csv(
    file.path(data_dir, paste0("uber-raw-data-", month_abbr, "14.csv")),
    col_types = raw_col_types
  )
}

apr_data <- read_month("apr")
may_data <- read_month("may")
jun_data <- read_month("jun")
jul_data <- read_month("jul")
aug_data <- read_month("aug")
sep_data <- read_month("sep")

# Stack April through September into one table.
uber_data_2014 <- bind_rows(
  apr_data, may_data, jun_data, jul_data, aug_data, sep_data
)
# Parse the raw "month/day/year hour:minute:second" strings exactly once.
# mdy_hms() yields a UTC POSIXct that keeps the wall-clock values found in the
# file, so the result does not depend on the session's local time zone and
# needs no second parse or conversion back through character.
#
# Every derived column is taken straight from the parsed timestamp:
#   Time       - "HH:MM:SS" text
#   Day        - day of month, as a factor
#   Month      - abbreviated month label, as a factor
#   Year       - calendar year, as a factor
#   DayofWeek  - abbreviated weekday label, as a factor
#   Hour/Minute/Second - clock components, as factors
uber_data_2014 <- uber_data_2014 %>%
  mutate(
    `Date/Time` = mdy_hms(`Date/Time`),
    Time = format(`Date/Time`, format = "%H:%M:%S"),
    Day = factor(day(`Date/Time`)),
    Month = factor(month(`Date/Time`, label = TRUE)),
    Year = factor(year(`Date/Time`)),
    DayofWeek = factor(wday(`Date/Time`, label = TRUE)),
    Hour = factor(hour(`Date/Time`)),
    Minute = factor(minute(`Date/Time`)),
    Second = factor(second(`Date/Time`))
  )

head(uber_data_2014)
# Number of trips in each hour of the day.
hour_data <- dplyr::count(uber_data_2014, Hour, name = "Total")
datatable(hour_data)

# Bar chart of the hourly totals.
ggplot(hour_data, aes(x = Hour, y = Total)) +
  geom_col(fill = "#f05d5d") +
  ggtitle("Trips Made Every Hour") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)
Observation: We can see that the number of trips is highest in the evening, around 5:00 PM and 6:00 PM.
Plotting Data for Trips Made Every Hour by Month
# Trips per (Month, Hour) pair. count() returns an ungrouped tibble, so no
# leftover grouping by Month is carried into later steps.
month_hour_data <- dplyr::count(uber_data_2014, Month, Hour, name = "Total")

# Hourly totals as stacked bars, one segment per month.
ggplot(month_hour_data, aes(x = Hour, y = Total, fill = Month)) +
  geom_col() +
  ggtitle("Trips Made by Hour and Month") +
  scale_y_continuous(labels = comma)
# Number of trips on each day of the month (all months pooled).
day_data <- dplyr::count(uber_data_2014, Day, name = "Total")
datatable(day_data)

# Bar chart of the per-day totals.
ggplot(day_data, aes(x = Day, y = Total)) +
  geom_col(fill = "#0576a3") +
  ggtitle("Trips Made Everyday") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)
Observation: Grouping by day, we can see that the 30th had the highest number of trips.
Plotting Trips made Everyday by Month
# Trips per (Month, Day) pair. count() returns an ungrouped tibble, so no
# leftover grouping by Month is carried into later steps.
day_month_data <- dplyr::count(uber_data_2014, Month, Day, name = "Total")

# Per-day totals as stacked bars, one segment per month, using the shared
# palette.
ggplot(day_month_data, aes(x = Day, y = Total, fill = Month)) +
  geom_col() +
  ggtitle("Trips by Day and Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors)
Observation: While the 30th had the highest number of trips overall, most of that total came from the month of April.
# Number of trips in each month.
month_data <- dplyr::count(uber_data_2014, Month, name = "Total")
datatable(month_data)

# Bar chart of the monthly totals. The bars use one constant fill colour; no
# variable is mapped to fill, so no fill scale is needed here.
ggplot(month_data, aes(x = Month, y = Total)) +
  geom_col(fill = "#34baad") +
  ggtitle("Trips by Month") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)
Observation: Most trips were made during the month of September.
Plotting Trips Taken During Months by Day of the Week
# Trips per (Month, DayofWeek) pair. count() returns an ungrouped tibble, so
# no leftover grouping by Month is carried into later steps.
dayofweek_month_data <- dplyr::count(
  uber_data_2014, Month, DayofWeek,
  name = "Total"
)

# Side-by-side bars: one cluster per month, one bar per weekday. The title and
# comma-formatted axis match the other bar charts in this script.
ggplot(dayofweek_month_data, aes(x = Month, y = Total, fill = DayofWeek)) +
  geom_col(position = "dodge") +
  ggtitle("Trips by Day of Week and Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors)
Observation: We can see that most of the trips were made in the month of September. The plot above also shows, for each month, the day of the week with the highest number of trips: Wednesday in April, Friday in May, Thursday in June, Thursday in July, and Tuesday in September.
# Trips per base; geom_bar() does the counting of rows for each Base value.
ggplot(uber_data_2014, aes(x = Base)) +
  geom_bar(fill = "#f2d56b") +
  scale_y_continuous(labels = comma) +
  ggtitle("Trips by Bases")
Observation: Base B02617 had the highest number of trips.
Plotting Number of Trips by Bases and Month
# Trips per (Month, Base) pair. count() returns an ungrouped tibble, so no
# leftover grouping by Month is carried into later steps.
base_month_data <- dplyr::count(uber_data_2014, Month, Base, name = "Total")

# Side-by-side bars: one cluster per base, one bar per month. The title and
# comma-formatted axis match the other bar charts in this script.
ggplot(base_month_data, aes(x = Base, y = Total, fill = Month)) +
  geom_col(position = "dodge") +
  ggtitle("Trips by Bases and Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors)
Observation: We can see that B02617 has the highest number of trips in the month of September.
Plotting Number of Trips by Bases and Day of the week
# Trips per (DayofWeek, Base) pair. count() returns an ungrouped tibble, so
# no leftover grouping by DayofWeek is carried into later steps.
base_dayofweek_data <- dplyr::count(
  uber_data_2014, DayofWeek, Base,
  name = "Total"
)

# Side-by-side bars: one cluster per base, one bar per weekday. The title and
# comma-formatted axis match the other bar charts in this script.
ggplot(base_dayofweek_data, aes(x = Base, y = Total, fill = DayofWeek)) +
  geom_col(position = "dodge") +
  ggtitle("Trips by Bases and Day of Week") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors)
Observation: Thursday had the highest number of trips at three bases - B02598, B02617, B02682.
Creating a Heatmap Visualization of day, hour and month, to allow us to simultaneously visualize clusters of samples and features. 11. Plotting heatmap by Hour and Day
# Heat maps: each section counts trips for one pair of variables and draws the
# counts as tiles shaded by Total. count() returns ungrouped tibbles, so none
# of these summaries carries leftover grouping.

# Day of month x hour of day.
day_and_hour <- dplyr::count(uber_data_2014, Day, Hour, name = "Total")
datatable(day_and_hour)
ggplot(day_and_hour, aes(x = Day, y = Hour, fill = Total)) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Hour and Day")

# Day of month x month.
day_and_month <- dplyr::count(uber_data_2014, Day, Month, name = "Total")
ggplot(day_and_month, aes(x = Day, y = Month, fill = Total)) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Month and Day")

# Day of week x month.
month_and_dayofweek <- dplyr::count(
  uber_data_2014, Month, DayofWeek,
  name = "Total"
)
ggplot(month_and_dayofweek, aes(x = DayofWeek, y = Month, fill = Total)) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Day of Week and Month")

# Base x month.
month_and_bases <- dplyr::count(uber_data_2014, Month, Base, name = "Total")
ggplot(month_and_bases, aes(x = Base, y = Month, fill = Total)) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Month and Bases")

# Base x day of week.
bases_and_dayofweek <- dplyr::count(
  uber_data_2014, DayofWeek, Base,
  name = "Total"
)
ggplot(bases_and_dayofweek, aes(x = Base, y = DayofWeek, fill = Total)) +
  geom_tile(color = "white") +
  ggtitle("Heat Map by Bases and Day of Week")
# Centre the map on the average pickup coordinate. na.rm = TRUE keeps one
# missing coordinate from turning the centre into NA; the point layers below
# already drop missing values the same way.
mean_lon <- mean(uber_data_2014$Lon, na.rm = TRUE)
mean_lat <- mean(uber_data_2014$Lat, na.rm = TRUE)

# Get map of New York City: a black-and-white road map centred on
# (mean_lon, mean_lat), passed in longitude/latitude order.
# NOTE(review): the request goes to the Google Static Maps API, so an API key
# is presumably registered elsewhere (e.g. via register_google()) - confirm.
NY_map <- get_map(
  location = c(mean_lon, mean_lat),
  zoom = 10,
  scale = "auto",
  maptype = "roadmap",
  color = "bw"
)
NY <- ggmap(NY_map)
NY

# Plot Uber pickup locations on the New York City map generated above based on
# latitude and longitude
NY_map_plots <- NY +
  geom_point(
    data = uber_data_2014,
    aes(x = Lon, y = Lat),
    color = "#f05d5d",
    size = 0.5,
    na.rm = TRUE
  )
NY_map_plots

# Plot Uber pickup locations on the New York City map generated above by Bases
NY_base_plot <- NY +
  geom_point(
    data = uber_data_2014,
    aes(x = Lon, y = Lat, color = Base),
    size = 0.5,
    na.rm = TRUE
  )
NY_base_plot
Summary: This project allowed us to create different visualizations using ggplot2 based on the data we have for several time frames of the year (2014). Finally, we made a geoplot of New York City that provided us with details of how various users made trips from different bases.